Load Data and Preprocess

# load packages first — the pipe (%>%) and filter() used in the pipelines
# below come from dplyr/tidyr, so they must be attached before the data loads
library(dplyr)
library(tidyr)
library(DataCombine)

# Google mobility report, restricted to US rows
google.mobility <- read.csv('Data/Global_Mobility_Report.csv') %>%
  filter(country_region == 'United States')

# JHU CSSE cumulative confirmed US cases (wide format: one column per date)
confirmed.cases.data <- read.csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv")


# clean data: reshape the JHU wide file to long (one row per county-date),
# parse the "X3.22.20"-style date columns, and compute daily new cases from
# the cumulative counts
confirmed.cases.data2 <- confirmed.cases.data %>%
  dplyr::rename(state = Province_State,
                county = Admin2) %>%
  dplyr::select(-UID, -iso2, -iso3, -code3,
                -Lat, -Long_, -Combined_Key, -Country_Region) %>%
  # pivot_longer() supersedes the retired gather()
  tidyr::pivot_longer(cols = -c(state, county, FIPS),
                      names_to = "date",
                      values_to = "confirmed.cases") %>%
  dplyr::mutate(date = gsub('X', '', date),
                date = as.Date(as.character(date), "%m.%d.%y"),
                county_state = paste0(county, ', ', state)) %>%
  arrange(county_state, date) %>%
  # previous day's cumulative count within each county;
  # new cases = day-on-day difference (NA on each county's first day)
  dplyr::group_by(county_state) %>%
  dplyr::mutate(lag = dplyr::lag(confirmed.cases, 1),
                new.cases = confirmed.cases - lag) %>%
  dplyr::ungroup()
## 
## Remember to order . by county_state and the time variable before running.
## 
## Lagging confirmed.cases by 1 time units.
## calculate rolling averages and remove non-states

# 14-day rolling average of new cases per county; drop non-state
# jurisdictions (territories and cruise ships)
covid.smooth.data_county <- confirmed.cases.data2 %>%
  dplyr::group_by(county_state, date) %>%
  # .groups = 'drop' silences the grouping message; we regroup explicitly next
  dplyr::summarise(new.cases = sum(new.cases), .groups = 'drop') %>%
  dplyr::group_by(county_state) %>%
  dplyr::mutate(cases_14days = zoo::rollmean(new.cases,
                                             k = 14, fill = 0)) %>%
  dplyr::ungroup() %>%
  # re-attach the county/state labels; merge() joins on the shared
  # county_state + date columns. (An empty no-op mutate() was removed here.)
  merge(confirmed.cases.data2 %>%
          dplyr::select(county_state, date, county, state)) %>%
  filter(!state %in% c('American Samoa',
                       'Diamond Princess',
                       'Grand Princess',
                       'Guam',
                       'Northern Mariana Islands',
                       'Puerto Rico',
                       'Virgin Islands'))
## `summarise()` has grouped output by 'county_state'. You can override using the
## `.groups` argument.
# county-level covariate table (county.data), prepared elsewhere in the project
load("Data/county.data.RData")

Climate DF

# suffixes stripped from Area_Name so county names line up across sources
patterns <- c(' Borough| Census Area| County| Parish| city')

# mapping of output name -> raw monthly temperature column; a named vector
# passed to all_of() renames during select()
temp.columns <- c(avg.dec.temp = 'Dec.Temp.AVG...F',
                  avg.jan.temp = 'Jan.Temp.AVG...F',
                  avg.feb.temp = 'Feb.Temp.AVG...F',
                  avg.mar.temp = 'Mar.Temp.AVG...F',
                  avg.apr.temp = 'Apr.Temp.AVG...F',
                  avg.may.temp = 'May.Temp.AVG...F',
                  avg.jun.temp = 'Jun.Temp.AVG...F',
                  avg.jul.temp = 'Jul.Temp.AVG...F',
                  avg.aug.temp = 'Aug.Temp.AVG...F',
                  avg.sep.temp = 'Sep.Temp.AVG...F',
                  avg.oct.temp = 'Oct.Temp.AVG...F',
                  avg.nov.temp = 'Nov.Temp.AVG...F')

# monthly average temperatures per county, keyed by "county, state"
climate.data <- county.data %>%
  # full state names come from the FIPS concordance file
  left_join(read.csv('Data/fips concordance.csv') %>%
              dplyr::select(State = Alpha.code, state_full = Name)) %>%
  dplyr::mutate(county = gsub(patterns, '', Area_Name),
                county_state = paste0(county, ', ', state_full),
                state = state_full) %>%
  dplyr::select(county, county_state, state, dplyr::all_of(temp.columns))
## Joining with `by = join_by(State)`

Mobility DF

# county-level Google mobility series (transit / workplaces / grocery),
# expressed as % change from baseline
mobility.data <- google.mobility %>%
  dplyr::mutate(county = gsub(patterns, '', sub_region_2),
                state = sub_region_1,
                date = as.Date(date)) %>%
  dplyr::select(county,
                state,
                date,
                mobility.transit = transit_stations_percent_change_from_baseline,
                mobility.workplaces = workplaces_percent_change_from_baseline,
                mobility.grocery = grocery_and_pharmacy_percent_change_from_baseline)

# drop rows with missing values in any column
mobility.data <- na.omit(mobility.data)

# count empty-string identifiers per column; na.rm = TRUE guards the Date
# column, where `col == ""` yields NA rather than FALSE (the original
# printed NA for date because of this)
blank_counts <- sapply(mobility.data, function(col) sum(col == "", na.rm = TRUE))
print(blank_counts)

# state-/country-level aggregate rows have blank county (and sometimes state)
# fields; keep only proper county rows
clean.mobility.data <- mobility.data %>% filter(county != "", state != "")

Transit DF

# suffixes stripped from Area_Name so county names line up across sources
patterns <- c(' Borough| Census Area| County| Parish| city')

# population-weighted transit scores per county, keyed by "county, state"
transit.data <- county.data %>%
  # full state names come from the FIPS concordance file
  left_join(
    read.csv('Data/fips concordance.csv') %>%
      dplyr::select(State = Alpha.code, state_full = Name)
  ) %>%
  dplyr::mutate(
    county = gsub(patterns, '', Area_Name),
    county_state = paste0(county, ', ', state_full),
    state = state_full
  ) %>%
  dplyr::select(
    county, county_state, state,
    transit.scores = transit_scores...population.weighted.averages.aggregated.from.town.city.level.to.county
  )
## Joining with `by = join_by(State)`

Collated Data

# patterns for cleaning county names
# (third redefinition of the same value in this file — harmless, apparently
# kept so each section can run on its own)

patterns <- c(' Borough| Census Area| County| Parish| city')


# covid case data 
# loads objects prepared elsewhere in the project (contents not visible here)
load("Data/covid.RData")

# Master analysis table: one row per county-day, combining the smoothed case
# series with county covariates and Google mobility, then converting counts
# to per-100k rates.
covid.data <- covid.smooth.data_county %>% 
  
  # county data

left_join(county.data %>% 
            
            # grab file with full state names 
  
  left_join(read.csv('Data/fips concordance.csv') %>% 
              dplyr::select(State = Alpha.code, 
                            state_full = Name)) %>%
         dplyr::mutate(county = gsub(patterns, '', Area_Name),
                       county_state = paste0(county, ', ', state_full)) %>% 
    
    dplyr::select(county_state,
                  FIPS = FIPS,
                  total.population = POP_ESTIMATE_2018,
                  pop.density = Density.per.square.mile.of.land.area...Population,
                  age.65.plus = Total_age65plus,
                  avg.dec.temp = Dec.Temp.AVG...F,
                  avg.jan.temp = Jan.Temp.AVG...F,
                  avg.feb.temp = Feb.Temp.AVG...F,
                  avg.mar.temp = Mar.Temp.AVG...F,
                  avg.apr.temp = Apr.Temp.AVG...F,
                  avg.may.temp = May.Temp.AVG...F,
                  avg.jun.temp = Jun.Temp.AVG...F,
                  avg.jul.temp = Jul.Temp.AVG...F,
                  avg.aug.temp = Aug.Temp.AVG...F,
                  avg.sep.temp = Sep.Temp.AVG...F,
                  avg.oct.temp = Oct.Temp.AVG...F,
                  avg.nov.temp = Nov.Temp.AVG...F,
                  icu.beds = ICU.Beds,
                  active.physicians = Active.Physicians.per.100000.Population.2018..AAMC.,
                  active.patient.care.physicians = Total.Active.Patient.Care.Physicians.per.100000.Population.2018..AAMC.,
                  housing.density = Density.per.square.mile.of.land.area...Housing.units,
                  transit.scores = transit_scores...population.weighted.averages.aggregated.from.town.city.level.to.county)) %>%
    
  # Google mobility joined via merge() on the shared county_state + date
  # columns (inner join, so county-days without mobility data are dropped).
  # NOTE(review): only the transit series has its NAs imputed to 0 here; the
  # workplace and grocery series keep their NAs — confirm this asymmetry is
  # intentional.
    
  merge(google.mobility %>% 
          dplyr::mutate(county = gsub(patterns, '', sub_region_2),
                        county_state = paste0(county, ', ', sub_region_1),
                        date = as.Date(date),
                        mobility.transit = ifelse(is.na(transit_stations_percent_change_from_baseline), 0,
                                   transit_stations_percent_change_from_baseline)) %>% 
  
  dplyr::select(county_state,
                date,
                mobility.transit,
                mobility.workplaces = workplaces_percent_change_from_baseline,
                mobility.grocery = grocery_and_pharmacy_percent_change_from_baseline)) %>% 
  
  # convert to rates
  
  # new cases per 100k residents; age.65.plus becomes a proportion of the
  # total population
  mutate(new.cases = new.cases / (total.population / 100000),
  #        icu.beds = icu.beds / (total.population / 100000),
         age.65.plus = age.65.plus / total.population)
## Joining with `by = join_by(State)`
## Joining with `by = join_by(county_state)`
  #        avg.winter.temp = (avg.dec.temp + avg.jan.temp + avg.feb.temp) / 3)


# lag new cases
# DataCombine::slide() assumes rows are ordered within each group (its own
# message says "Remember to order ... before running"), but the merges above
# do not guarantee county_state/date order — sort explicitly first.
covid.data <- covid.data %>% dplyr::arrange(county_state, date)

# previous day's new-case rate per county
new.cases.rate.lagged <- DataCombine::slide(covid.data,
                                     Var = 'new.cases',
                                     NewVar = 'new.cases.lag',
                                     GroupVar = 'county_state',
                                     slideBy = -1)

# workplace mobility 14 days earlier; slide() drops (with a warning) any
# county with 13 or fewer observations, since no valid 14-day lag exists
workplace.lag14 <- DataCombine::slide(covid.data,
                            Var = 'mobility.workplaces',
                            NewVar = 'workplaces.lag14',
                            GroupVar = 'county_state',
                            slideBy = -14)
## 
## Remember to order covid.data by county_state and the time variable before running.
## 
## Lagging mobility.workplaces by 14 time units.
## 
## 
## Warning: the following groups have 13 or fewer observations.
## No valid lag/lead can be created, so they are dropped:
## 
## Barber, Kansas
## Carter, Missouri
## Coke, Texas
## Day, South Dakota
## East Carroll, Louisiana
## Florence, Wisconsin
## Furnas, Nebraska
## Gentry, Missouri
## Gray, Kansas
## Hot Springs, Wyoming
## Kiowa, Colorado
## Lafayette, Arkansas
## Lake of the Woods, Minnesota
## Luce, Michigan
## Lyman, South Dakota
## Nome, Alaska
## Ontonagon, Michigan
## Ouray, Colorado
## Pondera, Montana
## Real, Texas
## Scotland, Missouri
## Scott, Illinois
## Stevens, Kansas
## Tripp, South Dakota
## Washington, Colorado
## Wheeler, Texas
## Wirt, West Virginia
## Wolfe, Kentucky
# combine the lagged series back onto covid.data; merge() defaults to an
# inner join on the shared county_state + date columns, so rows without a
# computable lag (each county's first day; short counties for the 14-day
# lag) are dropped here

covid.data <- covid.data %>% 
  
  merge(new.cases.rate.lagged %>% 
            dplyr::select(county_state, date, 
                        new.cases.lag)) %>% 
  
    merge(workplace.lag14 %>% 
            dplyr::select(county_state, date, 
                        workplaces.lag14))


# earlier attempt at re-attaching raw JHU columns, kept commented for reference
#covid.data <- covid.data %>%
 # merge(confirmed.cases.data %>%
  #        dplyr::rename(state = Province_State,
   #                     county = Admin2) %>%
    #      dplyr::select(-Lat, -Long_))
colnames(covid.data)
##  [1] "county_state"                   "date"                          
##  [3] "new.cases"                      "cases_14days"                  
##  [5] "county"                         "state"                         
##  [7] "FIPS"                           "total.population"              
##  [9] "pop.density"                    "age.65.plus"                   
## [11] "avg.dec.temp"                   "avg.jan.temp"                  
## [13] "avg.feb.temp"                   "avg.mar.temp"                  
## [15] "avg.apr.temp"                   "avg.may.temp"                  
## [17] "avg.jun.temp"                   "avg.jul.temp"                  
## [19] "avg.aug.temp"                   "avg.sep.temp"                  
## [21] "avg.oct.temp"                   "avg.nov.temp"                  
## [23] "icu.beds"                       "active.physicians"             
## [25] "active.patient.care.physicians" "housing.density"               
## [27] "transit.scores"                 "mobility.transit"              
## [29] "mobility.workplaces"            "mobility.grocery"              
## [31] "new.cases.lag"                  "workplaces.lag14"

Add shape files for any mapping tasks later

# county polygons: 2016 Census cartographic boundary file (1:20m scale).
# st_read() comes from sf — assumed attached elsewhere in the project; confirm.
us.shape.1 <- 
  st_read("Data/US shapefiles/cb_2016_us_county_20m.shp")
## Reading layer `cb_2016_us_county_20m' from data source 
##   `C:\Users\emily\Documents\GitHub\DATA4207-GroupProject\group-project-3\Data\US shapefiles\cb_2016_us_county_20m.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 3220 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -179.1743 ymin: 17.91377 xmax: 179.7739 ymax: 71.35256
## Geodetic CRS:  NAD83
us.shape.1 <- us.shape.1 %>%
  shift_geometry() # repositions Alaska/Hawaii for easier interpretation (tigris — confirm attached)

# state-level FIPS concordance
# (the original read this file twice on consecutive lines; once suffices)
fip.concordance <- read.csv("Data/fips concordance.csv")

# attach state abbreviations to the shapefile; STATEFP in the shapefile is a
# two-character zero-padded code, so pad single-digit numeric codes
us.shape.2 <- us.shape.1 %>% 
  merge(fip.concordance %>%
          mutate(STATEFP = ifelse(Numeric.code >= 0 & Numeric.code <= 9,
                                  paste0(0, Numeric.code), Numeric.code)) %>%
          dplyr::rename(state = Alpha.code))

# "County, ST" label used for joins and labels downstream
us.shape.2$county <- paste0(us.shape.2$NAME, ", ", us.shape.2$state)

# manual fixes where shapefile names differ from the other data sources
us.shape.2$county <- gsub("Doña Ana, NM", "Dona Ana, NM", us.shape.2$county)
us.shape.2$county <- gsub("LaSalle, LA", "La Salle, LA", us.shape.2$county)
us.shape.2$county <- gsub("Oglala Lakota, SD", "Oglala, SD", us.shape.2$county)
# Summary statistics for age over 65
# NOTE(review): covid.data stacks daily rows per county, so these summaries
# weight each county by its number of observed days — confirm that's intended.
summary(covid.data$age.65.plus)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.048   0.156   0.181   0.184   0.208   0.576   23767
# Correlation between age over 65 and new cases
# (use = "complete.obs" drops rows where either variable is NA)
cor(covid.data$age.65.plus, covid.data$new.cases, use = "complete.obs")
## [1] -0.03972087
# Scatter plot
plot(covid.data$new.cases, covid.data$age.65.plus, ylab = "Age Over 65", xlab = "New COVID-19 Cases", main = "Age Over 65 vs. New COVID-19 Cases")

# single-day snapshot used for the cross-sectional plots below
# (trailing whitespace removed from the date literal; strptime tolerated it,
# but it was accidental)
specific_date <- as.Date("2020-07-20")
filtered_data <- covid.data %>%
  filter(date == specific_date)
# Summary statistics for population density
summary(covid.data$pop.density)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.1    32.4    72.4   650.6   274.5 69468.4    2486
# Correlation between population density and new cases
# NOTE(review): the correlation uses all of covid.data but the scatter plot
# below uses the single-day filtered_data snapshot — confirm the mismatch is
# deliberate.
cor(covid.data$pop.density, covid.data$new.cases, use = "complete.obs")
## [1] 0.04094492
# Scatter plot
plot(filtered_data$new.cases, filtered_data$pop.density, ylab = "Population Density", xlab = "New COVID-19 Cases", main = "Population Density vs. New COVID-19 Cases")

# Summary statistics for active physicians per 100,000
summary(covid.data$active.patient.care.physicians)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   174.8   202.9   231.8   229.9   240.7   353.7    1919
# Correlation between active physicians and new cases
cor(covid.data$active.patient.care.physicians, covid.data$new.cases, use = "complete.obs")
## [1] -0.002778664
# Scatter plot
plot(covid.data$new.cases, covid.data$active.patient.care.physicians, ylab = "Active Physicians per 100,000", xlab = "New COVID-19 Cases", main = "Physicians vs. New COVID-19 Cases")

Descriptive Analysis

Transit

# count missing values per column before cleaning
# (across() replaces the superseded summarise_all())
na_count_per_column <- transit.data %>%
  summarise(across(everything(), ~ sum(is.na(.))))
print(na_count_per_column)

# drop rows with any NA (missing state or transit score)
clean.transit.data <- na.omit(transit.data)

# standardise transit scores as z-scores; na.rm is redundant after na.omit()
# but harmless. TRUE is spelled out rather than the reassignable shorthand T.
clean.transit.data <- clean.transit.data %>%
  mutate(z.transit = (transit.scores - mean(transit.scores, na.rm = TRUE)) /
           sd(transit.scores, na.rm = TRUE))

# distribution of the standardised scores
ggplot(clean.transit.data, aes(x = z.transit)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black", alpha = 0.7) +
  labs(title = "Standardised transit scores in America",
       x = "Transit Scores",
       y = "Frequency") +
  bbplot::bbc_style() +
  theme(plot.title = element_text(size = 12),
        axis.title.x = element_text(size = 8), # Adjust the x-axis label size
        axis.title.y = element_text(size = 8))

# drop shapefile bookkeeping columns and promote NAME to county
# (the original select() listed -Status twice; once suffices)
us.shape.2 <- us.shape.2 %>%
  select(-county, -Name, -Status, -COUNTYFP, -COUNTYNS, -STATEFP,
         -AFFGEOID, -LSAD, -ALAND, -AWATER, -Numeric.code) %>%
  rename(county = NAME)

# NOTE(review): the join key is county name only, so counties that share a
# name across states (e.g. "Washington") will cross-match. The two state
# columns cannot be added to the key as-is because one holds abbreviations
# and the other full names — confirm this is acceptable.
transit.map <- full_join(us.shape.2,
                         clean.transit.data %>%
                           select(county, state, z.transit),
                         by = "county") %>%
  st_as_sf()

# choropleth of standardised transit scores
ggplot(transit.map) +
  geom_sf(mapping = aes(fill = z.transit), colour = NA, inherit.aes = FALSE) +
  scale_fill_gradient2(
    low = "#2c7bb6",           # Blue
    mid = "#ffffbf",           # Yellow (midpoint)
    high = "#d7191c",          # Red
    midpoint = mean(transit.map$z.transit, na.rm = TRUE),
    limits = c(min(transit.map$z.transit, na.rm = TRUE),
               max(transit.map$z.transit, na.rm = TRUE)),
    label = scales::comma,
    na.value = "grey"          # counties with no matching transit score
  ) +
  labs(title = 'Standardised Transit Distribution') +
  guides(fill = guide_legend(keyheight = 0.3)) +
  theme_void() +
  bbplot::bbc_style() +
  theme(
    plot.title = element_text(size = 16),         # Make the title larger
    legend.position = 'bottom',
    legend.title = element_blank(),
    legend.text = element_text(size = 10),        # Make legend text smaller
    axis.title = element_text(size = 10),         # Make axis labels smaller
    axis.text = element_text(size = 8)            # Make axis text smaller
  )

Mobility

# install.packages("zoo")
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

# national daily means of the three mobility series
summarise.mobility.data <- clean.mobility.data %>%
  group_by(date) %>%
  summarise(across(c(mobility.transit, mobility.grocery, mobility.workplaces),
                   ~ mean(.x, na.rm = TRUE)))


# Mobility values are percentage changes from a baseline; a 7-day
# right-aligned rolling mean smooths day-of-week seasonality while keeping
# the values in percentage units.

# helper: add a 7-day right-aligned rolling average of one mobility column
# (factors out the triplicated arrange/mutate pipeline of the original)
add_rolling_avg <- function(df, col) {
  df %>%
    arrange(date) %>%
    mutate(rolling_avg = zoo::rollmean({{ col }}, 7, fill = NA, align = "right"))
}

grocery.mobility.data   <- add_rolling_avg(summarise.mobility.data, mobility.grocery)
transit.mobility.data   <- add_rolling_avg(summarise.mobility.data, mobility.transit)
workplace.mobility.data <- add_rolling_avg(summarise.mobility.data, mobility.workplaces)
# grocery/pharmacy mobility trend (7-day rolling average)
ggplot(grocery.mobility.data, mapping = aes(x = date, y = rolling_avg)) +
  geom_line(colour = "blue") +
  bbplot::bbc_style() +
  theme(plot.title = element_text(size = 12),
        axis.title.x = element_text(size = 8),  # shrink the x-axis label
        axis.title.y = element_text(size = 8)) +
  labs(title = "Changes in Visitors to Grocery and Pharmacy Stores Over Time",
       x = "Date",
       y = "7-day Rolling Average of Mobility (%)")

# Annotated version of the grocery/pharmacy trend with reference dates.
# NOTE(review): geom_line(size=) is deprecated in favour of linewidth in
# newer ggplot2 — confirm the installed version before changing.
# NOTE(review): the "Pandemic Declared" label is placed at max * -0.5,
# i.e. below zero — presumably deliberate since the series goes negative;
# verify the label is visible.
ggplot(grocery.mobility.data, aes(x = date, y = rolling_avg)) +
  geom_line(color = "blue", size = 1) +  # Thicker line for better visibility
  # geom_point(color = "red", size = 1, alpha = 0.7) +  # Adding points for data
  labs(title = "Changes in Visitors to Grocery and Pharmacy Stores Over Time",
       x = "Date",
       y = "7-day Rolling Average of Mobility (%)") +
  bbplot::bbc_style() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),  # Larger, bold title
    axis.title.x = element_text(size = 12),  # Larger x-axis label
    axis.title.y = element_text(size = 12),  # Larger y-axis label
    axis.text.x = element_text(size = 10, angle = 45, hjust = 1),  # Rotate x-axis labels for better readability
    axis.text.y = element_text(size = 10)
  ) +
  scale_x_date(date_labels = "%b %Y", date_breaks = "1 month") +  # Format x-axis dates
  geom_vline(xintercept = as.numeric(as.Date(c("2020-03-11", "2020-12-25"))),  # Add vertical lines for significant dates
             linetype = "dashed", color = "grey50") +
  annotate("text", x = as.Date("2020-03-11"), y = max(grocery.mobility.data$rolling_avg, na.rm = TRUE)* -0.5,
           label = "Pandemic Declared", angle = 90, vjust = -0.5, size = 3, color = "grey50") +
  annotate("text", x = as.Date("2020-12-25"), y = max(grocery.mobility.data$rolling_avg, na.rm = TRUE)* 0.8,
           label = "Christmas", angle = 90, vjust = -0.5, size = 3, color = "grey50")

# transit mobility trend (7-day rolling average)
ggplot(transit.mobility.data, aes(date, rolling_avg)) +
  geom_line(colour = "blue") +
  theme_minimal() +
  labs(title = "Changes in Visitors to Transit Over Time",
       x = "Date",
       y = "7-day Rolling Average of Mobility (%)")

# workplace mobility trend (7-day rolling average)
ggplot(workplace.mobility.data, aes(date, rolling_avg)) +
  geom_line(colour = "blue") +
  theme_minimal() +
  labs(title = "Changes in Visitors to Workplaces Over Time",
       x = "Date",
       y = "7-day Rolling Average of Mobility (%)")

# Tag each series with its mobility type so they can share one plot
grocery.mobility.data$type   <- "Grocery and Pharmacy"
transit.mobility.data$type   <- "Transit"
workplace.mobility.data$type <- "Workplace"

# Stack the three series into one long dataframe
combined_mobility_data <- bind_rows(grocery.mobility.data,
                                    transit.mobility.data,
                                    workplace.mobility.data)

# Build the static plot, one coloured line per mobility type
p <- ggplot(combined_mobility_data,
            aes(x = date, y = rolling_avg, colour = type)) +
  geom_line() +
  theme_minimal() +
  labs(title = "Changes in Mobility Over Time",
       x = "Date",
       y = "7-day Rolling Average of Mobility (%)",
       colour = "Mobility Type")

# Convert the ggplot to an interactive plotly plot
interactive_plot <- ggplotly(p)

# Display the interactive plot
interactive_plot

Climate data

# Changing the climate data from a wide format (one column per month) to a
# long format (county, state, month, temp).

# rename avg.<mon>.temp -> avg.temp.<mon> so a single prefix selects and
# strips all twelve month columns below
climate.data <- climate.data %>%
  dplyr::select(county,
                state,
                avg.temp.jan = avg.jan.temp,
                avg.temp.feb = avg.feb.temp,
                avg.temp.mar = avg.mar.temp,
                avg.temp.apr = avg.apr.temp,
                avg.temp.may = avg.may.temp,
                avg.temp.jun = avg.jun.temp,
                avg.temp.jul = avg.jul.temp,
                avg.temp.aug = avg.aug.temp,
                avg.temp.sep = avg.sep.temp,
                avg.temp.oct = avg.oct.temp,
                avg.temp.nov = avg.nov.temp,
                avg.temp.dec = avg.dec.temp)

climate.data.long <- climate.data %>%
  pivot_longer(
    cols = starts_with("avg.temp."),
    names_to = "month",
    names_prefix = "avg.temp.",  # strips the prefix, leaving "jan".."dec"
    values_to = "temp"
  ) %>%
  # capitalise to the standard abbreviations ("Jan".."Dec"); replaces the
  # twelve-branch case_when of the original
  mutate(month = month.abb[match(month, tolower(month.abb))])
# us.shape.2 <- us.shape.2 %>%
#   select(-county, -Name, -Status, -COUNTYFP, -COUNTYNS, -STATEFP, -AFFGEOID, -GEOID, -LSAD, -ALAND, -AWATER, -Numeric.code, -Status)%>%
#   rename(county = NAME)

# NOTE(review): the original comment said "inner join" but full_join is used,
# which keeps counties with no climate match (temp = NA, drawn grey on the
# map). The county-name-only key also cross-matches same-named counties in
# different states — confirm both are intended.
climate.map <- full_join(us.shape.2,
                         climate.data.long %>%
                           select(county, state, month, temp),
                         by = "county") %>%
  st_as_sf() # to acknowledge it's a shape file

# state-level monthly averages; .groups belongs to summarise(), not mean() —
# the original passed it inside mean(), where it was silently swallowed by
# `...` and the grouping message still appeared
climate.state <- climate.data.long %>%
  group_by(state, month) %>%
  summarise(avg_temp = mean(temp, na.rm = TRUE), .groups = 'drop')
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.
# Average temperature per state by month. Month is stored as character, so
# without an explicit factor the x axis would sort alphabetically
# (Apr, Aug, ...); month.abb supplies the chronological order.
ggplot(climate.state,
       aes(x = factor(month, levels = month.abb),
           y = avg_temp, group = state, color = state)) +
  geom_line() +
  geom_point() +
  labs(title = "Average Temperature Over Each Month Per State",
       x = "Month",
       y = "Average Temperature") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 12),  # Set title size
    axis.title.x = element_text(size = 10), # Set x-axis label size
    axis.title.y = element_text(size = 10), # Set y-axis label size
    legend.position = "bottom",  # Move legend below plot
    legend.title = element_blank(),  # Remove legend title
    legend.text = element_text(size = 8)  # Set legend text size
  ) +
  guides(color = guide_legend(nrow = 5))  # wrap the 50-state legend over 5 rows

# Interactive version of the state-by-month temperature plot; months are
# ordered chronologically via month.abb (as character they'd sort
# alphabetically, the defect in the original).
p <- ggplot(climate.state,
            aes(x = factor(month, levels = month.abb),
                y = avg_temp, group = state, color = state)) +
  geom_line() +
  geom_point() +
  labs(title = "Average Temperature Over Each Month Per State",
       x = "Month",
       y = "Average Temperature") +
  theme_minimal()

# Convert the ggplot object to plotly
p <- ggplotly(p, tooltip = c("x", "y", "color"))

# Show the interactive plot
p
# chronological ordering for the facet panels; without it the facets would
# appear alphabetically (Apr, Aug, ...)
month_order <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")

# Convert the month variable to a factor with the desired order
climate.map$month <- factor(climate.map$month, levels = month_order)

# choropleth of average temperature, one panel per month; counties with no
# climate match (temp NA from the full_join) are drawn grey via na.value
ggplot(climate.map) +
  geom_sf(mapping = aes(fill = temp), colour = NA, inherit.aes = FALSE) +
  scale_fill_gradient2(low = "#0072B2",
                       high = "red",
                       midpoint = mean(climate.map$temp, na.rm = TRUE),
                       limits = c(min(climate.map$temp, na.rm = TRUE),
                                  max(climate.map$temp, na.rm = TRUE)),
                       label = scales::comma,
                       na.value = "grey") +
  labs(title = 'Average Climate distribution') +
  guides(fill = guide_legend(keyheight = 0.3)) +
  theme_void() +
  theme(plot.title = element_text(size = 7),
        legend.position = 'bottom',
        legend.title = element_blank()) +
  facet_wrap(~month) # This line tells ggplot to create separate panels for each unique value in the month column. Each panel will display the map for one month.

Sarah’s stuff :)

# Align the join keys between covid.data and the shapefile.
# GEOID in Census cartographic boundary files is a 5-character, zero-padded
# county FIPS string, while covid.data stores FIPS numerically; a plain
# as.character() drops leading zeros (1001 vs "01001"), so every state with
# a FIPS code below 10 would silently fail to join. sprintf restores the
# padding. NOTE(review): spot-check a low-FIPS state (e.g. Alabama) after
# the join to confirm.
covid.data <- covid.data %>%
  mutate(FIPS = sprintf("%05d", as.integer(FIPS)))

us.shape.2 <- us.shape.2 %>%
  mutate(GEOID = as.character(GEOID))
# Left-join covid data onto the shapefile, processing shape_data in row
# chunks of chunk_size. (A single left_join would give the same result;
# chunking only bounds the work done per step.)
#
# shape_data: data frame with a GEOID key column
# covid_data: data frame with a FIPS key column
# returns: shape_data rows with covid_data columns attached
join_in_chunks <- function(shape_data, covid_data, chunk_size = 1000) {
  n <- nrow(shape_data)
  starts <- seq(1, n, by = chunk_size)

  # join each chunk, collect the pieces in a list, and bind once at the end —
  # this avoids the quadratic cost of bind_rows() inside the loop that the
  # original incurred by growing `result` incrementally
  pieces <- lapply(starts, function(i) {
    shape_data[i:min(i + chunk_size - 1, n), ] %>%
      left_join(covid_data, by = c("GEOID" = "FIPS"))
  })

  bind_rows(pieces)
}

# Perform join in chunks
county_data <- join_in_chunks(us.shape.2, covid.data, chunk_size = 1000)

# reproject for mapping (EPSG:5070)
county_data <- st_transform(county_data, crs = st_crs(5070))
# simplify geometry to speed up plotting; dTolerance is in the units of the
# projected CRS — presumably metres for EPSG:5070, confirm
county_data <- st_simplify(county_data, dTolerance = 1000)

# Render the population-density choropleth in row chunks of chunk_size.
# data: sf data frame with a population-density column
plot_in_chunks <- function(data, chunk_size = 1000) {
  n <- nrow(data)

  for (i in seq(1, n, by = chunk_size)) {
    chunk <- data[i:min(i + chunk_size - 1, n), ]

    p <- ggplot(data = chunk) +
      # NOTE(review): covid.data names this column "pop.density" (see the
      # colnames() output above); the original's "pop_density" would not be
      # found once the plot is rendered — confirm against county_data.
      geom_sf(aes(fill = pop.density), color = NA) +
      scale_fill_viridis_c(option = "viridis") +
      theme_minimal() +
      labs(title = paste("Population Density by County (Chunk", i, "to",
                         min(i + chunk_size - 1, n), ")"),
           fill = "Density (per sq. mile)")

    # inside a for loop/function, ggplot objects are not auto-printed; the
    # original built p and discarded it, so nothing was ever drawn
    print(p)
  }
}

# Call the function to plot in chunks
plot_in_chunks(county_data, chunk_size = 1000)

# choropleth: share of population aged 65+ per county
ggplot(county_data) +
  geom_sf(mapping = aes(fill = age.65.plus), color = NA) +
  scale_fill_viridis_c(option = "viridis") +
  theme_minimal() +
  labs(title = "Proportion of Population Age 65+ by County",
       fill = "Proportion")

# choropleth: active patient-care physicians per 100,000 residents
ggplot(county_data) +
  geom_sf(mapping = aes(fill = active.patient.care.physicians), color = NA) +
  scale_fill_viridis_c(option = "viridis") +
  theme_minimal() +
  labs(title = "Active Physicians per 100,000 by County",
       fill = "Physicians/100k")

Model

Columns available in covid.data (output of `colnames(covid.data)`):

[1] "county_state" "date" "new.cases"
[4] "cases_14days" "county" "state"
[7] "FIPS" "total.population" "pop.density"
[10] "age.65.plus" "avg.dec.temp" "avg.jan.temp"
[13] "avg.feb.temp" "avg.mar.temp" "avg.apr.temp"
[16] "avg.may.temp" "avg.jun.temp" "avg.jul.temp"
[19] "avg.aug.temp" "avg.sep.temp" "avg.oct.temp"
[22] "avg.nov.temp" "icu.beds" "active.physicians"
[25] "active.patient.care.physicians" "housing.density" "transit.scores"
[28] "mobility.transit" "mobility.workplaces" "mobility.grocery"
[31] "new.cases.lag" "workplaces.lag14"

# Baseline model: 14-day rolling case average on the twelve monthly average
# temperatures.
# NOTE(review): the temperatures are fixed county attributes while the
# outcome varies daily, so coefficients capture cross-county contrasts only.
model2 <- lm(
  cases_14days ~ avg.jan.temp + avg.feb.temp + avg.mar.temp + avg.apr.temp +
    avg.may.temp + avg.jun.temp + avg.jul.temp + avg.aug.temp + avg.sep.temp +
    avg.oct.temp + avg.nov.temp + avg.dec.temp,
  data = covid.data  # name the data argument rather than passing positionally
)

summary(model2)
## 
## Call:
## lm(formula = cases_14days ~ avg.jan.temp + avg.feb.temp + avg.mar.temp + 
##     avg.apr.temp + avg.may.temp + avg.jun.temp + avg.jul.temp + 
##     avg.aug.temp + avg.sep.temp + avg.oct.temp + avg.nov.temp + 
##     avg.dec.temp, data = covid.data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##   -788    -56    -29      7  38704 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -397.00403    4.06516  -97.66   <2e-16 ***
## avg.jan.temp    1.49125    0.11727   12.72   <2e-16 ***
## avg.feb.temp   -7.73632    0.08672  -89.21   <2e-16 ***
## avg.mar.temp    6.24124    0.15992   39.03   <2e-16 ***
## avg.apr.temp    4.16294    0.17651   23.59   <2e-16 ***
## avg.may.temp   -5.82414    0.11941  -48.77   <2e-16 ***
## avg.jun.temp  -24.30396    0.18125 -134.09   <2e-16 ***
## avg.jul.temp   26.63539    0.18637  142.92   <2e-16 ***
## avg.aug.temp   -8.48234    0.16586  -51.14   <2e-16 ***
## avg.sep.temp    2.30451    0.12589   18.31   <2e-16 ***
## avg.oct.temp    8.02431    0.11472   69.95   <2e-16 ***
## avg.nov.temp    9.33308    0.17684   52.78   <2e-16 ***
## avg.dec.temp   -3.31985    0.12168  -27.28   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 240.6 on 2379966 degrees of freedom
##   (10961 observations deleted due to missingness)
## Multiple R-squared:  0.03495,    Adjusted R-squared:  0.03495 
## F-statistic:  7183 on 12 and 2379966 DF,  p-value: < 2.2e-16
# Extend the temperature model with the three mobility series.
model3 <- lm(
  cases_14days ~ mobility.transit + mobility.grocery + mobility.workplaces +
    avg.jan.temp + avg.feb.temp + avg.mar.temp + avg.apr.temp + avg.may.temp +
    avg.jun.temp + avg.jul.temp + avg.aug.temp + avg.sep.temp + avg.oct.temp +
    avg.nov.temp + avg.dec.temp,
  data = covid.data  # name the data argument rather than passing positionally
)
summary(model3)
## 
## Call:
## lm(formula = cases_14days ~ mobility.transit + mobility.grocery + 
##     mobility.workplaces + avg.jan.temp + avg.feb.temp + avg.mar.temp + 
##     avg.apr.temp + avg.may.temp + avg.jun.temp + avg.jul.temp + 
##     avg.aug.temp + avg.sep.temp + avg.oct.temp + avg.nov.temp + 
##     avg.dec.temp, data = covid.data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##   -799    -74    -30     18  38597 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -561.34293    6.63602  -84.59   <2e-16 ***
## mobility.transit      -1.86760    0.01204 -155.09   <2e-16 ***
## mobility.grocery      -0.67687    0.01799  -37.63   <2e-16 ***
## mobility.workplaces   -0.91827    0.01813  -50.65   <2e-16 ***
## avg.jan.temp           4.81217    0.20528   23.44   <2e-16 ***
## avg.feb.temp         -10.46979    0.15244  -68.68   <2e-16 ***
## avg.mar.temp           8.94395    0.26505   33.74   <2e-16 ***
## avg.apr.temp          -3.75114    0.30867  -12.15   <2e-16 ***
## avg.may.temp          -6.13216    0.19875  -30.85   <2e-16 ***
## avg.jun.temp         -24.58477    0.29336  -83.81   <2e-16 ***
## avg.jul.temp          31.89003    0.32071   99.44   <2e-16 ***
## avg.aug.temp         -13.60139    0.30398  -44.74   <2e-16 ***
## avg.sep.temp          10.87912    0.22105   49.22   <2e-16 ***
## avg.oct.temp           4.75981    0.19643   24.23   <2e-16 ***
## avg.nov.temp          10.11356    0.31310   32.30   <2e-16 ***
## avg.dec.temp          -2.65084    0.21474  -12.34   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 300 on 1443118 degrees of freedom
##   (947806 observations deleted due to missingness)
## Multiple R-squared:  0.06917,    Adjusted R-squared:  0.06916 
## F-statistic:  7149 on 15 and 1443118 DF,  p-value: < 2.2e-16
library(lmtest)

# Durbin-Watson test for serial correlation in model3's residuals.
# The reported DW of ~0.016 (far below 2) signals severe positive
# autocorrelation — expected for stacked daily county series, and a reason
# to distrust the OLS standard errors reported above.
dwtest(model3)
## 
##  Durbin-Watson test
## 
## data:  model3
## DW = 0.016454, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
# Plot residuals vs fitted values
plot(fitted(model3), residuals(model3), main = "Residuals vs Fitted")
abline(h = 0, col = "red")

# Q-Q plot of residuals to assess normality
qqnorm(residuals(model3))
qqline(residuals(model3), col = "red")

Results: a data journalism project explaining the spread of COVID-19 in the US. Why has it impacted some areas more than others?

COVID-19